from matplotlib import pyplot as plt
import scanpy as sc
import anndata as ad
import celltypist
from celltypist import models
import pandas as pd
import numpy as np
from scipy.spatial.distance import jensenshannon
import seaborn as sns
sample1 = sc.read_10x_mtx('samples/case1yf/')
sample2 = sc.read_10x_mtx('samples/case1zy/')
sample3 = sc.read_10x_mtx('samples/case2yf/')
sample4 = sc.read_10x_mtx('samples/case2zc/')
sample5 = sc.read_10x_mtx('samples/case2zy/')
sample6 = sc.read_10x_mtx('samples/case3yf/')
sample7 = sc.read_10x_mtx('samples/case3zy/')
sample8 = sc.read_10x_mtx('samples/case4zy/')
Goals: QC for each sample and doublet detection
Figures:
Discussion:
# create dictionary of samples
samples = {
"sample1": sample1,
"sample2": sample2,
"sample3": sample3,
"sample4": sample4,
"sample5": sample5,
"sample6": sample6,
"sample7": sample7,
"sample8": sample8}
# figure out number of genes and cells before filtering
for sample_name, sample_data in samples.items():
print(sample_name, " Number of genes: ", sample_data.var.shape[0])
print(sample_name, " Number of cells: ", sample_data.obs.shape[0])
sample1 Number of genes: 33538 sample1 Number of cells: 21560 sample2 Number of genes: 33538 sample2 Number of cells: 13685 sample3 Number of genes: 33538 sample3 Number of cells: 11786 sample4 Number of genes: 33538 sample4 Number of cells: 6606 sample5 Number of genes: 33538 sample5 Number of cells: 9406 sample6 Number of genes: 33538 sample6 Number of cells: 9379 sample7 Number of genes: 33538 sample7 Number of cells: 8837 sample8 Number of genes: 33538 sample8 Number of cells: 2024
# identify mitochondrial, ribosomal, and hemoglobin genes for each sample
for name, sample in samples.items():
sample.var["mt"] = sample.var_names.str.startswith("MT-")
sample.var["ribo"] = sample.var_names.str.startswith(("RPS", "RPL"))
sample.var["hb"] = sample.var_names.str.contains("^HB[^(P)]")
# calculate QC metrics
for name, sample in samples.items():
sc.pp.calculate_qc_metrics(
sample, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True
)
# QC plots for number unique genes per barcode, total number of molecules per barcode, and % reads that map to mitochondrial genome
for name, sample in samples.items():
print(name)
sc.pl.violin(
sample,
["n_genes_by_counts", "total_counts", "pct_counts_mt"],
jitter=0.4,
multi_panel=True,
)
... storing 'feature_types' as categorical
sample1
... storing 'feature_types' as categorical
sample2
... storing 'feature_types' as categorical
sample3
... storing 'feature_types' as categorical
sample4
... storing 'feature_types' as categorical
sample5
... storing 'feature_types' as categorical
sample6
... storing 'feature_types' as categorical
sample7
... storing 'feature_types' as categorical
sample8
# QC plots for joint visualization of above metrics
for name, sample in samples.items():
print(name)
sc.pl.scatter(sample, "total_counts", "n_genes_by_counts", color="pct_counts_mt")
sample1
sample2
sample3
sample4
sample5
sample6
sample7
sample8
Filtering based on thresholds found in paper:
Additionally, filtering out barcodes with less than 3 cells
# filtering
for name, sample in samples.items():
sc.pp.filter_cells(sample, min_genes=200)
sc.pp.filter_genes(sample, min_cells=3)
samples[name] = sample[sample.obs['pct_counts_mt'] < 20, :]
# doublet finding
for name, sample in samples.items():
sc.pp.scrublet(sample)
for sample_name, sample_data in samples.items():
print(sample_name, " Number of genes post-scrublet: ", sample_data.var.shape[0])
print(sample_name, " Number of cells post-scrublet: ", sample_data.obs.shape[0])
sample1 Number of genes post-scrublet: 21640 sample1 Number of cells post-scrublet: 8160 sample2 Number of genes post-scrublet: 21572 sample2 Number of cells post-scrublet: 8497 sample3 Number of genes post-scrublet: 21715 sample3 Number of cells post-scrublet: 11348 sample4 Number of genes post-scrublet: 18492 sample4 Number of cells post-scrublet: 6398 sample5 Number of genes post-scrublet: 22523 sample5 Number of cells post-scrublet: 9100 sample6 Number of genes post-scrublet: 22668 sample6 Number of cells post-scrublet: 8176 sample7 Number of genes post-scrublet: 22970 sample7 Number of cells post-scrublet: 7809 sample8 Number of genes post-scrublet: 16470 sample8 Number of cells post-scrublet: 1406
| Sample | # of Cells Before | # of Cells After | # of Genes Before | # of Genes After |
|---|---|---|---|---|
| Sample 1 | 21560 | 8160 | 33538 | 21640 |
| Sample 2 | 13685 | 8497 | 33538 | 21572 |
| Sample 3 | 11786 | 11348 | 33538 | 21715 |
| Sample 4 | 6606 | 6398 | 33538 | 18492 |
| Sample 5 | 9406 | 9100 | 33538 | 22523 |
| Sample 6 | 9379 | 8176 | 33538 | 22668 |
| Sample 7 | 8837 | 7809 | 33538 | 22970 |
| Sample 8 | 2024 | 1406 | 33538 | 16470 |
Discussion:
# combine samples
adata = ad.concat([samples["sample1"], samples["sample2"], samples["sample3"], samples["sample4"], samples["sample5"], samples["sample6"], samples["sample7"], samples["sample8"]],
join='outer', label="sample", keys=["sample1", "sample2", "sample3", "sample4", "sample5", "sample6", "sample7", "sample8"])
# copy over the counts matrix
adata.layers["counts"] = adata.X.copy()
# median scaling normalization
sc.pp.normalize_total(adata)
# log(x+1) transformation
sc.pp.log1p(adata)
adata
AnnData object with n_obs × n_vars = 60894 × 25870
obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'n_genes', 'doublet_score', 'predicted_doublet', 'sample'
uns: 'log1p'
layers: 'counts'
Goals: choose appropriate method for feature selection and number of highly variable features for downstream analysis
Figures:
Discussion:
sc.pp.highly_variable_genes(adata, n_top_genes=2000, batch_key="sample")
adata
AnnData object with n_obs × n_vars = 60894 × 25870
obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'n_genes', 'doublet_score', 'predicted_doublet', 'sample'
var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'
uns: 'log1p', 'hvg'
layers: 'counts'
sc.pl.highly_variable_genes(adata)
Figures:
Discussion:
# run PCA
sc.tl.pca(adata)
# visualize PCs
sc.pl.pca_variance_ratio(adata, n_pcs=50, log=True)
Goals: decide on appropriate clustering method and create visualization (UMAP/T-SNE), choose appropriate resolution for clustering algorithm
Figures:
Discussion:
# compute nearest neighbors graph based on first 30 PCs
sc.pp.neighbors(adata, n_pcs=30)
# call UMAP (unsupervised) clustering learning technique
sc.tl.umap(adata, random_state=42)
/projectnb/bf528/students/adale/.conda/envs/scanpy_scrnaseq_final/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
# visualize and label cell clusters
sc.tl.leiden(adata, flavor="igraph", n_iterations=2, resolution=0.8)
sc.pl.umap(adata, color=["leiden"], legend_loc='on data')
# visualize and label cell clusters by sample of origin
sc.pl.umap(
adata,
color="sample",
size=2
)
Figures:
Discussion:
sc.pl.umap(
adata,
color="sample",
size=2
)
pip install bbknn
Requirement already satisfied: bbknn in /projectnb/bf528/students/adale/.conda/envs/scanpy_scrnaseq_final/lib/python3.12/site-packages (1.6.0) Requirement already satisfied: Cython in /projectnb/bf528/students/adale/.conda/envs/scanpy_scrnaseq_final/lib/python3.12/site-packages (from bbknn) (3.0.12) Requirement already satisfied: scipy>=1.6.0 in /projectnb/bf528/students/adale/.conda/envs/scanpy_scrnaseq_final/lib/python3.12/site-packages (from bbknn) (1.15.2) Requirement already satisfied: numpy in /projectnb/bf528/students/adale/.conda/envs/scanpy_scrnaseq_final/lib/python3.12/site-packages (from bbknn) (2.2.5) Requirement already satisfied: pandas in /projectnb/bf528/students/adale/.conda/envs/scanpy_scrnaseq_final/lib/python3.12/site-packages (from bbknn) (2.2.3) Requirement already satisfied: annoy in /projectnb/bf528/students/adale/.conda/envs/scanpy_scrnaseq_final/lib/python3.12/site-packages (from bbknn) (1.17.3) Requirement already satisfied: pynndescent in /projectnb/bf528/students/adale/.conda/envs/scanpy_scrnaseq_final/lib/python3.12/site-packages (from bbknn) (0.5.13) Requirement already satisfied: umap-learn in /projectnb/bf528/students/adale/.conda/envs/scanpy_scrnaseq_final/lib/python3.12/site-packages (from bbknn) (0.5.7) Requirement already satisfied: scikit-learn>=1.0.2 in /projectnb/bf528/students/adale/.conda/envs/scanpy_scrnaseq_final/lib/python3.12/site-packages (from bbknn) (1.5.2) Requirement already satisfied: joblib>=1.2.0 in /projectnb/bf528/students/adale/.conda/envs/scanpy_scrnaseq_final/lib/python3.12/site-packages (from scikit-learn>=1.0.2->bbknn) (1.5.0) Requirement already satisfied: threadpoolctl>=3.1.0 in /projectnb/bf528/students/adale/.conda/envs/scanpy_scrnaseq_final/lib/python3.12/site-packages (from scikit-learn>=1.0.2->bbknn) (3.6.0) Requirement already satisfied: python-dateutil>=2.8.2 in /projectnb/bf528/students/adale/.conda/envs/scanpy_scrnaseq_final/lib/python3.12/site-packages (from pandas->bbknn) (2.9.0.post0) Requirement already satisfied: pytz>=2020.1 in /projectnb/bf528/students/adale/.conda/envs/scanpy_scrnaseq_final/lib/python3.12/site-packages (from pandas->bbknn) (2025.2) Requirement already satisfied: tzdata>=2022.7 in /projectnb/bf528/students/adale/.conda/envs/scanpy_scrnaseq_final/lib/python3.12/site-packages (from pandas->bbknn) (2025.2) Requirement already satisfied: six>=1.5 in /projectnb/bf528/students/adale/.conda/envs/scanpy_scrnaseq_final/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas->bbknn) (1.17.0) Requirement already satisfied: numba>=0.51.2 in /projectnb/bf528/students/adale/.conda/envs/scanpy_scrnaseq_final/lib/python3.12/site-packages (from pynndescent->bbknn) (0.61.2) Requirement already satisfied: llvmlite>=0.30 in /projectnb/bf528/students/adale/.conda/envs/scanpy_scrnaseq_final/lib/python3.12/site-packages (from pynndescent->bbknn) (0.44.0) Requirement already satisfied: tqdm in /projectnb/bf528/students/adale/.conda/envs/scanpy_scrnaseq_final/lib/python3.12/site-packages (from umap-learn->bbknn) (4.67.1) Note: you may need to restart the kernel to use updated packages.
%%time
sc.external.pp.bbknn(adata, batch_key="sample")
WARNING: consider updating your call to make use of `computation` CPU times: user 12.9 s, sys: 227 ms, total: 13.1 s Wall time: 13.2 s
sc.tl.umap(adata)
sc.pl.umap(adata, size=2, color="sample")
sc.tl.leiden(adata, flavor="igraph", n_iterations=2, resolution=0.8)
sc.pl.umap(adata, color=["leiden"], legend_loc='on data')
Figures:
Discussion:
# rank genes using Leiden clusters and Wilcoxon method
sc.tl.rank_genes_groups(adata, groupby="leiden", method="wilcoxon")
# initialize list to hold top marker dfs for each cluster
top_markers_dfs = []
# get cluster labels
cluster_names = adata.uns['rank_genes_groups']['names'].dtype.names
# loop through each cluster
for cluster in cluster_names:
# extract marker genes for cluster
df = sc.get.rank_genes_groups_df(adata, group=cluster)
# grab the top 5 markers
top_5 = df.head(5).copy() # .copy() to avoid SettingWithCopyWarning
# add column that indicates which cluster these marker genes represent
top_5['cluster'] = cluster
# add dataframe to list of dataframes
top_markers_dfs.append(top_5)
# concatenate all dataframes into one
top5markers = pd.concat(top_markers_dfs, axis=0)
# show table of top 5 marker genes for each cluster
pd.set_option('display.max_rows', None)
print(top5markers)
names scores logfoldchanges pvals pvals_adj cluster 0 TFF1 135.895401 5.891925 0.000000e+00 0.000000e+00 0 1 MUC1 135.809494 4.936536 0.000000e+00 0.000000e+00 0 2 AGR2 135.759903 5.093108 0.000000e+00 0.000000e+00 0 3 TFF3 135.001099 5.430393 0.000000e+00 0.000000e+00 0 4 STARD10 132.385391 4.143276 0.000000e+00 0.000000e+00 0 0 C19orf33 95.697174 3.421688 0.000000e+00 0.000000e+00 1 1 S100A6 91.898376 2.566411 0.000000e+00 0.000000e+00 1 2 KRT19 87.625374 3.157036 0.000000e+00 0.000000e+00 1 3 SPINT2 87.129570 2.953014 0.000000e+00 0.000000e+00 1 4 MAL2 84.686111 3.165730 0.000000e+00 0.000000e+00 1 0 IGKC 104.103470 6.587687 0.000000e+00 0.000000e+00 2 1 IGHA1 102.399521 6.122136 0.000000e+00 0.000000e+00 2 2 IGLC2 102.113052 6.031720 0.000000e+00 0.000000e+00 2 3 INS 101.132919 7.454987 0.000000e+00 0.000000e+00 2 4 PRSS2 99.939674 6.739924 0.000000e+00 0.000000e+00 2 0 GNG11 37.659233 5.978167 2.310245e-310 5.976604e-306 3 1 HSPG2 35.926765 4.868357 1.167337e-282 1.509951e-278 3 2 CALCRL 35.703964 7.862346 3.430942e-279 2.958616e-275 3 3 IFITM3 34.744625 2.967423 1.670796e-264 1.080588e-260 3 4 RAMP2 34.714046 9.179764 4.836606e-264 2.502460e-260 3 0 HLA-DRA 72.776703 3.922657 0.000000e+00 0.000000e+00 4 1 CD74 67.888695 3.084810 0.000000e+00 0.000000e+00 4 2 AIF1 66.369888 3.546318 0.000000e+00 0.000000e+00 4 3 HLA-DRB1 65.079491 3.008163 0.000000e+00 0.000000e+00 4 4 HLA-DPA1 63.726089 3.129188 0.000000e+00 0.000000e+00 4 0 FTL 122.270210 3.949274 0.000000e+00 0.000000e+00 5 1 APOE 120.356041 5.952228 0.000000e+00 0.000000e+00 5 2 CTSB 120.355240 4.404148 0.000000e+00 0.000000e+00 5 3 C1QA 119.656700 6.099206 0.000000e+00 0.000000e+00 5 4 CTSD 118.933044 4.145571 0.000000e+00 0.000000e+00 5 0 CCL5 100.206161 3.822394 0.000000e+00 0.000000e+00 6 1 MALAT1 97.267006 2.064209 0.000000e+00 0.000000e+00 6 2 PTPRC 87.483734 2.351960 0.000000e+00 0.000000e+00 6 3 CD3D 74.666801 2.792630 0.000000e+00 0.000000e+00 6 4 CD3G 67.723312 3.026016 0.000000e+00 0.000000e+00 6 0 MT1X 104.626373 4.985076 0.000000e+00 0.000000e+00 7 1 MT2A 102.434273 4.713325 0.000000e+00 0.000000e+00 7 2 CXCR4 82.812210 3.069840 0.000000e+00 0.000000e+00 7 3 BTG1 82.729111 2.395216 0.000000e+00 0.000000e+00 7 4 MT1E 78.785507 3.720927 0.000000e+00 0.000000e+00 7 0 NKG7 70.465973 5.036767 0.000000e+00 0.000000e+00 8 1 CCL5 60.124821 3.951833 0.000000e+00 0.000000e+00 8 2 KLRD1 58.788811 5.017536 0.000000e+00 0.000000e+00 8 3 GZMA 51.213333 3.601730 0.000000e+00 0.000000e+00 8 4 KLRB1 51.010963 3.583180 0.000000e+00 0.000000e+00 8 0 IL7R 102.098984 3.868607 0.000000e+00 0.000000e+00 9 1 BTG1 91.339149 2.280180 0.000000e+00 0.000000e+00 9 2 CXCR4 85.819328 2.739116 0.000000e+00 0.000000e+00 9 3 TSC22D3 84.562920 2.375976 0.000000e+00 0.000000e+00 9 4 TNFAIP3 83.100296 2.562305 0.000000e+00 0.000000e+00 9 0 FTH1 69.748932 3.863620 0.000000e+00 0.000000e+00 10 1 CXCL8 69.533867 7.225544 0.000000e+00 0.000000e+00 10 2 NAMPT 64.931732 5.004308 0.000000e+00 0.000000e+00 10 3 SRGN 61.433968 3.749251 0.000000e+00 0.000000e+00 10 4 BCL2A1 60.468006 5.511665 0.000000e+00 0.000000e+00 10 0 SPP1 43.308453 3.829355 0.000000e+00 0.000000e+00 11 1 APOC1 43.005383 3.464934 0.000000e+00 0.000000e+00 11 2 APOE 42.754692 3.501902 0.000000e+00 0.000000e+00 11 3 LYZ 41.477932 3.108666 0.000000e+00 0.000000e+00 11 4 FTL 40.992519 2.178048 0.000000e+00 0.000000e+00 11 0 TPSB2 53.052387 9.620810 0.000000e+00 0.000000e+00 12 1 CPA3 52.246220 10.258164 0.000000e+00 0.000000e+00 12 2 TPSAB1 52.233009 10.373271 0.000000e+00 0.000000e+00 12 3 LTC4S 46.493904 5.956984 0.000000e+00 0.000000e+00 12 4 IL1RL1 46.068459 7.578917 0.000000e+00 0.000000e+00 12 0 IGFBP7 116.031326 6.633002 0.000000e+00 0.000000e+00 13 1 CALD1 115.179062 7.795003 0.000000e+00 0.000000e+00 13 2 BGN 113.616379 7.574769 0.000000e+00 0.000000e+00 13 3 COL1A2 112.177513 8.250024 0.000000e+00 0.000000e+00 13 4 COL1A1 112.168350 8.102842 0.000000e+00 0.000000e+00 13 0 CD37 38.969078 3.282210 0.000000e+00 0.000000e+00 14 1 MS4A1 37.874260 7.498545 0.000000e+00 0.000000e+00 14 2 CD74 37.072380 3.009670 7.830114e-301 6.752169e-297 14 3 HLA-DRA 36.909210 3.449338 3.288498e-298 2.126836e-294 14 4 RPS8 36.561207 1.559842 1.183424e-292 6.123035e-289 14
Goals: Assign preliminary labels for cell identities of clusters with your choice of tool
Figures:
Discussion:
adata_celltypist = adata.copy()
adata_celltypist.obs_names_make_unique()
adata_celltypist.var_names_make_unique()
predictions = celltypist.annotate(adata_celltypist, model = 'Immune_All_High.pkl', majority_voting = True)
⚠️ Warning: invalid expression matrix, expect ALL genes and log1p normalized expression to 10000 counts per cell. The prediction result may not be accurate 🔬 Input data has 60894 cells and 25870 genes 🔗 Matching reference genes in the model 🧬 5848 features used for prediction ⚖️ Scaling input data 🖋️ Predicting labels ✅ Prediction done! 👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it ⛓️ Over-clustering input data with resolution set to 20 🗳️ Majority voting the predictions ✅ Majority voting done!
predictions.predicted_labels.head()
| predicted_labels | over_clustering | majority_voting | |
|---|---|---|---|
| AAACCCAAGCTAATCC-1 | Epithelial cells | 76 | Epithelial cells |
| AAACCCAAGGCTGTAG-1 | T cells | 75 | Epithelial cells |
| AAACCCACAAACAGGC-1 | Epithelial cells | 108 | Epithelial cells |
| AAACCCAGTACTCGAT-1 | Epithelial cells | 213 | Epithelial cells |
| AAACCCAGTGCTGCAC-1 | Epithelial cells | 62 | Epithelial cells |
adata_celltypist.obs = adata_celltypist.obs.join(predictions.predicted_labels)
sc.tl.umap(adata_celltypist)
sc.pl.umap(adata_celltypist, color = ['majority_voting'], size = 2)
Goals: Search through the literature using our marker gene analysis along with the automatic annotation algorithm to provide final determinations of the cell identities for each cluster.
Figures:
Discussion:
# create dotplot
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5)
WARNING: dendrogram data not found (using key=dendrogram_leiden). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
# find clusters with most cells
adata.obs['leiden'].value_counts()
leiden 6 8233 0 8073 9 6587 1 6091 5 5965 13 5276 2 5228 7 4744 4 3031 8 2259 10 1890 11 1136 12 966 14 766 3 649 Name: count, dtype: int64
top3_names = ['6','0','9']
adata_top3 = adata[adata.obs['leiden'].isin(top3_names)].copy()
sc.pl.rank_genes_groups_dotplot(adata_top3[adata_top3.obs['leiden'] == '6'], n_genes=5)
WARNING: Dendrogram not added. Dendrogram is added only when the number of categories to plot > 2
sc.pl.rank_genes_groups_dotplot(adata_top3[adata_top3.obs['leiden'] == '0'], n_genes=5)
WARNING: Dendrogram not added. Dendrogram is added only when the number of categories to plot > 2
sc.pl.rank_genes_groups_dotplot(adata_top3[adata_top3.obs['leiden'] == '9'], n_genes=5)
WARNING: Dendrogram not added. Dendrogram is added only when the number of categories to plot > 2
# create dictionary of manual labels based on Leiden clusters
manual_labels = {
'0' : "Goblet Cells",
'1' : "Epithelial Cells",
'2': "Plasma Cells",
'3': "Endothelial Cells",
'4': "Macrophages",
'5': "Macrophages",
'6': "T Cells",
'7': "T Cells",
'8': "Innate Lymphoid Cells",
'9': "T Cells",
'10': "Monocytes",
'11': "Macrophages",
'12': "Mast Cells",
'13': "Fibroblasts",
'14': "B Cells"
}
# map manual labels onto data
adata_celltypist.obs['manual'] = adata_celltypist.obs['leiden'].map(manual_labels)
# visualize with manual labels
sc.pl.umap(adata_celltypist, color = ['manual'], size = 2, legend_loc="on data", legend_fontsize=8, legend_fontweight="normal")
... storing 'manual' as categorical
This section was supposed to include recreations of two of the paper's major findings, either: Pseudotime Analysis, RNA Velocity, Cell Proportion analyis, or Cell Signaling Analysis. After trouble shooting for several hours, I wasn't able to recreate these analyses for two reasons. The first reason was that the researchers didn't include the data to be able to reproduce their Pseudotime Analysis or RNA Velocity. The second reason was that with the recommended packages to carry out additional analyses there were package dependencies that the SCC does not have installed. As an alternative, I tried to recreate a figure similar to 2F and 2G. Additionally, I calculated the proportions of each manually labeled cell type for each sample and tissue type the samples were collected from (primary pancreatic tumors (PT), hepatic metastases (HM), and normal pancreatic tissue (PT)). After that, I tried to calculate JSD scores and make a similar rendering of figure 1F.
Figure 2F/G Recreation
For this figure, I created a heatmap of the average gene expression for the top gene markers for the manual cell type labels across the different tissue sources. The main observation is that PTPRC appears to have significantly elevated expression levels in cancerous tissue samples compared to normal pancreatic tissue. PTPRC, otherwise known as CD45, has been shown to have an important impact in cancer development and is a typical drug target for cancer treatment. Our observation aligns with the results of other studies that have observed enhanced exprssion od CD45 in tumor samples (Ye et al. 2022).
Cell Type Proportions Across Samples and Tissue Types
In this investigation I was interested in observing which cell types show up the most in each sample and each tissue. Case2ZC is our only sample from normal pancreatic tissue. In normal tissue we are likely to see less immune specific cells such as T Cells, B Cells, mast cells and monocytes since there is not an active immune attack on the body. In cancerous tissue, we observe a larger variety of immune cells that play critical roles in tumor microenvironments (TME). Across cancerous tissue samples, we see an increase in the proportion of macrophages, T Cells, and endothelial cells which are some of the main cellular components of TMEs (Tie et al. 2022.)).
JSD Analysis
JSD allows us to understand differences between the two distributions of two different populations. In this context, I used JSD to understand the expression differences in the various cell types across the two cancerous tissue types. This calculation can allow us to quantify differences in the molecular composition of the two tissue samples (Everaert et al. 2020). Overall, the JSD values were fairly low between the tissue types, with a greater divergence in PT than HM.
# dicitonary mapping actual sample names to sample replicate identities
sample_names = {
"sample1" : "CASE1YF",
"sample2" : "CASE1ZY",
"sample3" : "CASE2YF",
"sample4" : "CASE2ZC",
"sample5" : "CASE2ZY",
"sample6" : "CASE3YF",
"sample7" : "CASE3ZY",
"sample8" : "CASE4ZY"
}
# dictionary mapping tissue origins to samples
sample_groups = {
"CASE1YF" : "PT",
"CASE1ZY" : "HM",
"CASE2YF" : "PT",
"CASE2ZC" : "NT",
"CASE2ZY" : "HM",
"CASE3YF" : "PT",
"CASE3ZY" : "HM",
"CASE4ZY" : "HM"
}
# map sample names and tissue types onto the observation data
adata_celltypist.obs['sample_names'] = adata_celltypist.obs['sample'].map(sample_names)
adata_celltypist.obs['sample_groups'] = adata_celltypist.obs['sample_names'].map(sample_groups)
adata_celltypist.obs.head()
| n_genes_by_counts | log1p_n_genes_by_counts | total_counts | log1p_total_counts | pct_counts_in_top_50_genes | pct_counts_in_top_100_genes | pct_counts_in_top_200_genes | pct_counts_in_top_500_genes | total_counts_mt | log1p_total_counts_mt | ... | doublet_score | predicted_doublet | sample | leiden | predicted_labels | over_clustering | majority_voting | manual | sample_names | sample_groups | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AAACCCAAGCTAATCC-1 | 4845 | 8.485909 | 20821.0 | 9.943766 | 32.635320 | 40.881802 | 49.824696 | 61.942270 | 3436.0 | 8.142354 | ... | 0.176301 | False | sample1 | 0 | Epithelial cells | 76 | Epithelial cells | Goblet Cells | CASE1YF | PT |
| AAACCCAAGGCTGTAG-1 | 3575 | 8.182000 | 12284.0 | 9.416134 | 27.727125 | 37.577336 | 48.632367 | 63.082058 | 700.0 | 6.552508 | ... | 0.040038 | False | sample1 | 0 | T cells | 75 | Epithelial cells | Goblet Cells | CASE1YF | PT |
| AAACCCACAAACAGGC-1 | 1772 | 7.480428 | 4832.0 | 8.483223 | 30.132450 | 43.687914 | 56.436258 | 71.750828 | 186.0 | 5.231109 | ... | 0.328283 | False | sample1 | 2 | Epithelial cells | 108 | Epithelial cells | Plasma Cells | CASE1YF | PT |
| AAACCCAGTACTCGAT-1 | 3975 | 8.288032 | 16395.0 | 9.704793 | 30.186032 | 41.957914 | 53.888381 | 66.813053 | 1615.0 | 7.387709 | ... | 0.031746 | False | sample1 | 0 | Epithelial cells | 213 | Epithelial cells | Goblet Cells | CASE1YF | PT |
| AAACCCAGTGCTGCAC-1 | 3634 | 8.198364 | 13037.0 | 9.475623 | 28.710593 | 39.786761 | 51.024009 | 64.539388 | 1075.0 | 6.981006 | ... | 0.017869 | False | sample1 | 0 | Epithelial cells | 62 | Epithelial cells | Goblet Cells | CASE1YF | PT |
5 rows × 28 columns
# extract list of top5markers for each cluster
marker_genes = top5markers['names'].tolist()
# instantiate expression matrix with sample group (columns) and marker gene (rows) labels
expr_matrix = pd.DataFrame(
columns=adata_celltypist.obs['sample_groups'].unique(),
index=marker_genes
)
# add average expression values to matrix
for group in expr_matrix.columns:
group_mask = adata_celltypist.obs['sample_groups'] == group
group_means = np.array(adata_celltypist[group_mask, marker_genes].X.mean(axis=0)).flatten()
expr_matrix[group] = group_means
# map genes for each cluster onto the matrix to get the clusters
gene_to_cluster = top5markers.set_index('names')['cluster'].to_dict()
expr_matrix['leiden'] = expr_matrix.index.map(gene_to_cluster)
# map manual cell labels onto the clusters
expr_matrix['cells'] = expr_matrix['leiden'].map(manual_labels)
# reset index to allow the genes to be a column
expr_matrix = expr_matrix.reset_index()
# rename column
expr_matrix.rename(columns={"index": "gene"}, inplace=True)
expr_matrix.head(15)
| gene | PT | HM | NT | leiden | cells | |
|---|---|---|---|---|---|---|
| 0 | TFF1 | 0.934727 | 0.829021 | 0.212076 | 0 | Goblet Cells |
| 1 | MUC1 | 0.673725 | 0.512330 | 0.389904 | 0 | Goblet Cells |
| 2 | AGR2 | 0.732537 | 0.478116 | 0.188773 | 0 | Goblet Cells |
| 3 | TFF3 | 0.910009 | 0.568599 | 0.141552 | 0 | Goblet Cells |
| 4 | STARD10 | 0.399791 | 0.285614 | 0.153858 | 0 | Goblet Cells |
| 5 | C19orf33 | 0.318430 | 0.310597 | 0.345247 | 1 | Epithelial Cells |
| 6 | S100A6 | 2.196421 | 2.271431 | 3.061669 | 1 | Epithelial Cells |
| 7 | KRT19 | 0.715776 | 0.626178 | 1.754007 | 1 | Epithelial Cells |
| 8 | SPINT2 | 0.363668 | 0.299812 | 0.551749 | 1 | Epithelial Cells |
| 9 | MAL2 | 0.355712 | 0.246315 | 0.259628 | 1 | Epithelial Cells |
| 10 | IGKC | 0.146011 | 0.101022 | 3.100226 | 2 | Plasma Cells |
| 11 | IGHA1 | 0.067331 | 0.039209 | 2.191011 | 2 | Plasma Cells |
| 12 | IGLC2 | 0.118611 | 0.043537 | 2.390383 | 2 | Plasma Cells |
| 13 | INS | 0.005352 | 0.000000 | 3.310904 | 2 | Plasma Cells |
| 14 | PRSS2 | 0.035995 | 0.016204 | 2.587812 | 2 | Plasma Cells |
# set up data for heatmap
heatmap_data = expr_matrix.set_index("gene")[["PT", "HM", "NT"]]
# create heatmap
plt.figure(figsize=(4, 8))
sns.heatmap(
heatmap_data,
cmap="viridis", # Or "coolwarm", "magma", etc.
cbar_kws={'label': 'Expression Level'}
)
plt.title("Gene Expression Heatmap of Top Marker Genes per Cell Cluster Across Tissue Types", wrap=True)
plt.xlabel("Tissue Type")
plt.ylabel("Top Marker Genes")
plt.show()
# grab cell counts
cell_counts = adata_celltypist.obs.groupby(['sample_names', 'sample_groups'])['manual'].value_counts().unstack(fill_value=0)
cell_counts = cell_counts.reset_index()
cell_counts.columns.name = None
# calculate proportions
props = cell_counts.iloc[:, 2:]
col_props = props.div(props.sum(axis=0), axis=1)
cols = pd.concat([cell_counts.iloc[:,0:2], col_props], axis=1)
df = cols.loc[~(cols.select_dtypes(include='number') == 0).all(axis=1)]
df
| sample_names | sample_groups | B Cells | Endothelial Cells | Epithelial Cells | Fibroblasts | Goblet Cells | Innate Lymphoid Cells | Macrophages | Mast Cells | Monocytes | Plasma Cells | T Cells | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | CASE1YF | PT | 0.005222 | 0.049307 | 0.124118 | 0.097991 | 0.650687 | 0.158477 | 0.047079 | 0.149068 | 0.001587 | 0.014537 | 0.027602 |
| 3 | CASE1ZY | HM | 0.086162 | 0.123267 | 0.073059 | 0.014405 | 0.287254 | 0.086321 | 0.285136 | 0.015528 | 0.184656 | 0.009564 | 0.102893 |
| 8 | CASE2YF | PT | 0.169713 | 0.215716 | 0.017403 | 0.572403 | 0.000372 | 0.142098 | 0.138670 | 0.503106 | 0.009524 | 0.024675 | 0.285729 |
| 10 | CASE2ZC | NT | 0.001305 | 0.052388 | 0.133804 | 0.042835 | 0.000000 | 0.000443 | 0.074319 | 0.006211 | 0.000000 | 0.872609 | 0.000000 |
| 12 | CASE2ZY | HM | 0.203655 | 0.520801 | 0.065835 | 0.013078 | 0.003221 | 0.401948 | 0.168476 | 0.002070 | 0.690476 | 0.025822 | 0.207166 |
| 17 | CASE3YF | PT | 0.382507 | 0.013867 | 0.306682 | 0.102540 | 0.046080 | 0.036299 | 0.074418 | 0.091097 | 0.020635 | 0.021232 | 0.205428 |
| 18 | CASE3ZY | HM | 0.066580 | 0.016949 | 0.207848 | 0.156558 | 0.012139 | 0.114210 | 0.201145 | 0.232919 | 0.048677 | 0.021614 | 0.144705 |
| 21 | CASE4ZY | HM | 0.084856 | 0.007704 | 0.071253 | 0.000190 | 0.000248 | 0.060204 | 0.010758 | 0.000000 | 0.044444 | 0.009946 | 0.026477 |
# set up data for bar plot
df_plot = df.set_index('sample_names')
groups = df_plot.pop('sample_groups')
# create bar plot
df_plot.plot(kind='bar', stacked=True, figsize=(10, 6))
plt.ylabel('Proportion')
plt.xlabel('Sample')
plt.xticks(rotation=45, ha='right')
plt.title('Cell Type Proportions by Sample')
plt.legend(title='Cell Type', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
df_groups = df.set_index('sample_groups')
names = df_groups.pop('sample_names')
df_grouped = df_groups.groupby(df_groups.index).sum()
df_grouped
| B Cells | Endothelial Cells | Epithelial Cells | Fibroblasts | Goblet Cells | Innate Lymphoid Cells | Macrophages | Mast Cells | Monocytes | Plasma Cells | T Cells | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| sample_groups | |||||||||||
| HM | 0.441253 | 0.668721 | 0.417994 | 0.184230 | 0.302861 | 0.662683 | 0.665515 | 0.250518 | 0.968254 | 0.066947 | 0.481241 |
| NT | 0.001305 | 0.052388 | 0.133804 | 0.042835 | 0.000000 | 0.000443 | 0.074319 | 0.006211 | 0.000000 | 0.872609 | 0.000000 |
| PT | 0.557441 | 0.278891 | 0.448202 | 0.772934 | 0.697139 | 0.336875 | 0.260166 | 0.743271 | 0.031746 | 0.060444 | 0.518759 |
df_grouped.plot(kind='bar', stacked=True, figsize=(10, 6))
# Step 3: Customize
plt.ylabel('Proportion')
plt.xlabel('Tissue Type')
plt.xticks(rotation=45, ha='right')
plt.title('Cell Type Proportions by Tissue Type')
plt.legend(title='Cell Type', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
# define JSD function
def compute_jsd(p, q):
# Normalize distributions to sum to 1
p = p / np.sum(p)
q = q / np.sum(q)
return jensenshannon(p, q, base=2) # base=2 gives values between 0 and 1
# create expression values df
expr_df = pd.DataFrame(adata_celltypist.X.toarray(), index=adata_celltypist.obs_names, columns=adata_celltypist.var_names)
# cell type labels
expr_df['manual'] = adata_celltypist.obs['manual'].values
expr_df['sample_groups'] = adata_celltypist.obs['sample_groups'].values
# mean expression per gene per cell type
mean_expr_by_type = expr_df.groupby(['sample_groups','manual']).mean()
# global average across all cells
global_mean_expr = expr_df.drop(columns=['manual', 'sample_groups']).mean()
# JSD between each cell type and the global mean
jsd_values = mean_expr_by_type.apply(lambda row: compute_jsd(row.values, global_mean_expr.values), axis=1)
# set up data
jsd_results = pd.DataFrame(jsd_values)
jsd_results.columns = ['JSD']
jsd_df = jsd_results.reset_index()
jsd_df = jsd_df[jsd_df['sample_groups'] != 'NT']
# create figure
plt.figure(figsize=(10, 6))
sns.barplot(data=jsd_df, x='manual', y='JSD', hue='sample_groups')
plt.xticks(rotation=45, ha='right')
plt.ylabel("Jensen-Shannon Divergence")
plt.xlabel("Cell Type")
plt.title("JSD per Cell Type by Sample Group")
plt.tight_layout()
plt.show()